{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "aa821f27",
   "metadata": {},
   "source": [
    "## Notebook 2 - Reference MS2 database"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac411b88",
   "metadata": {},
   "source": [
    "This notebook builds the MS2 reference database based on MoNA. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1ecd8fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run ../common.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3d8eb5ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_substrates = pd.read_csv('./tmp/Substrates_VB.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "19f4e230",
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdkit import Chem\n",
    "from rdkit.Chem import Descriptors\n",
    "from rdkit.Chem import rdMolDescriptors\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "150f4de9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def inchikey_to_inchi(inchikey):\n",
    "    url = f'https://cactus.nci.nih.gov/chemical/structure/{inchikey}/stdinchi'\n",
    "    response = requests.get(url)\n",
    "    if response.status_code == 200:\n",
    "        return response.text.strip()\n",
    "    else:\n",
    "        raise ValueError(f\"Could not retrieve InChI for InChIKey: {inchikey}\")\n",
    "\n",
    "def get_exact_mass_from_inchikey(inchikey):\n",
    "    inchi = inchikey_to_inchi(inchikey)\n",
    "    molecule = Chem.MolFromInchi(inchi)\n",
    "    exact_mass = rdMolDescriptors.CalcExactMolWt(molecule)\n",
    "    return exact_mass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "352a3b90",
   "metadata": {},
   "outputs": [],
   "source": [
    "substrates_iks = set(df_substrates['ik_MoNA'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "50c078b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_text_block(block):\n",
    "    data = {}\n",
    "    lines = block.strip().split('\\n')\n",
    "    i = 0\n",
    "    while i < len(lines):\n",
    "        line = lines[i].strip()\n",
    "        if line.startswith(\"Name:\"):\n",
    "            data['Name'] = line.split(':', 1)[1].strip()\n",
    "        #elif line.startswith(\"Synon:\"):\n",
    "        #    data['Synon'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"DB#:\"):\n",
    "            data['DB#'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"InChIKey:\"):\n",
    "            data['InChIKey'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"Precursor_type:\"):\n",
    "            data['Precursor_type'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"Spectrum_type:\"):\n",
    "            data['Spectrum_type'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"PrecursorMZ:\"):\n",
    "            data['PrecursorMZ'] = float(line.split(':', 1)[1].strip())\n",
    "        elif line.startswith(\"Instrument_type:\"):\n",
    "            data['Instrument_type'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"Instrument:\"):\n",
    "            pass\n",
    "        #    data['Instrument'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"Ion_mode:\"):\n",
    "            data['Ion_mode'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"Collision_energy:\"):\n",
    "            pass\n",
    "        #    data['Collision_energy'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"Formula:\"):\n",
    "            data['Formula'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"MW:\"):\n",
    "            pass\n",
    "        #    data['MW'] = float(line.split(':', 1)[1].strip())\n",
    "        elif line.startswith(\"ExactMass:\"):\n",
    "            data['ExactMass'] = float(line.split(':', 1)[1].strip())\n",
    "        elif line.startswith(\"Comments:\"):\n",
    "            pass\n",
    "        #    data['Comments'] = line.split(':', 1)[1].strip()\n",
    "        elif line.startswith(\"Num Peaks:\"):\n",
    "            data['Num Peaks'] = int(line.split(':', 1)[1].strip())\n",
    "            mz = []\n",
    "            relint = []\n",
    "            for _ in range(data['Num Peaks']):\n",
    "                i += 1\n",
    "                peak_line = lines[i].strip()\n",
    "                parts = peak_line.split()\n",
    "                mz.append(float(parts[0]))\n",
    "                relint.append(float(parts[1]))\n",
    "\n",
    "            \n",
    "            # Normalize intensities to 1\n",
    "            relint = relint / np.max(relint)\n",
    "\n",
    "            # Keep only the top 50 peaks based on intensity\n",
    "            if len(relint) > 50:\n",
    "                top_50_idx = np.argsort(relint)[-50:]\n",
    "                mz = np.array(mz)[top_50_idx]\n",
    "                relint = relint[top_50_idx]\n",
    "\n",
    "            # Sort the arrays from low to high mz. Sorting is needed by matchms\n",
    "            idx = np.argsort(mz)\n",
    "            mz = np.array(mz)[idx]\n",
    "            relint = np.array(relint)[idx]\n",
    "\n",
    "            # Save entry only if there are at least 5 peaks with relint >= 0.02\n",
    "            if np.sum(relint >= 0.02) < 5:\n",
    "                return None\n",
    "\n",
    "            data['mz'] = mz\n",
    "            data['relint'] = relint\n",
    "        i += 1\n",
    "\n",
    "    # Perform checks\n",
    "    if ('InChIKey' in data \n",
    "        #and 'ExactMass' in data and\n",
    "        #data.get('Ion_mode') == 'P' and \n",
    "        #'ESI' in data.get('Instrument_type', '') and \n",
    "        #data.get('Spectrum_type') == 'MS2'\n",
    "        ):\n",
    "        \n",
    "        data['ik_MoNA'] = data['InChIKey'].split('-')[0]\n",
    "\n",
    "        if data['ik_MoNA'] not in substrates_iks:\n",
    "            return None\n",
    "\n",
    "        # Perform the exact mass check\n",
    "        # inchi_key = data['InChIKey']\n",
    "        # try:\n",
    "        #     calc_exact_mass = get_exact_mass_from_inchikey(inchi_key)\n",
    "        #     massdif = abs(calc_exact_mass - data['ExactMass'])\n",
    "        #     if massdif > 0.5:  # Tolerance for floating-point comparison\n",
    "        #         print(f'Exact mass mismatch: {massdif} amu')\n",
    "        #         return None\n",
    "        # except:\n",
    "        #     return None\n",
    "    else:\n",
    "        return None\n",
    "\n",
    "    \n",
    "    return data\n",
    "\n",
    "def parse_file(file_path):\n",
    "    with open(file_path, 'r') as file:\n",
    "        content = file.read()\n",
    "    blocks = content.split('\\n\\n')\n",
    "    \n",
    "    parsed_data = []\n",
    "    for block in tqdm(blocks, desc=\"Parsing blocks\"):\n",
    "        parsed_block = parse_text_block(block)\n",
    "        if parsed_block is not None:\n",
    "            parsed_data.append(parsed_block)\n",
    "    \n",
    "    return parsed_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e89be22c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Parsing blocks: 100%|██████████| 99261/99261 [00:11<00:00, 8594.72it/s] \n"
     ]
    }
   ],
   "source": [
    "file_path = '../data/MoNA/MoNA-export-LC-MS-MS_Positive_Mode_26May2024.msp'\n",
    "parsed_data = parse_file(file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c82e1107",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp = pd.DataFrame(parsed_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4fcccd78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(6797, 14)\n"
     ]
    }
   ],
   "source": [
    "print(df_tmp.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c2d5b3f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "43b84f84",
   "metadata": {},
   "source": [
    "We merge the dataframe `df_tmp` with the `df_substrates` dataframe:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "72e4c0ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df_substrates.merge(df_tmp[['ik_MoNA', 'relint', 'mz', 'DB#']], on=['ik_MoNA'], how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4bb2c466",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7450, 20)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b029493e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(filepath_results + 'MS2_database_shifts_162_320_VB.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e562cb3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_pickle(filepath_results + 'MS2_database_shifts_162_320_VB.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c2b54349",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp.to_csv('./tmp/parsed_mona.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6abd052c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "project_gt",
   "language": "python",
   "name": "project_gt"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
